{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import dautil as dl\n",
    "import pandas as pd\n",
    "from scipy import stats\n",
    "import numpy as np\n",
    "import math\n",
    "from sklearn.utils import check_random_state\n",
    "import matplotlib.pyplot as plt\n",
    "from IPython.display import HTML\n",
    "from IPython.display import display"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "wb = dl.data.Worldbank()\n",
    "indicator = wb.get_name('co2')\n",
    "co2 = wb.download(country='NL', indicator=indicator, start=1900,\n",
    "                  end=2014)\n",
    "co2.index = [int(year) for year in co2.index.get_level_values(1)]\n",
    "temp = pd.DataFrame(dl.data.Weather.load()['TEMP'].resample('A'))\n",
    "temp.index = temp.index.year\n",
    "temp.index.name = 'year'\n",
    "df = pd.merge(co2, temp, left_index=True, right_index=True).dropna()\n",
    "\n",
    "stats_corr = stats.pearsonr(df[indicator].values, df['TEMP'].values)\n",
    "print('Correlation={0:.4g}, p-value={1:.4g}'.format(stats_corr[0], stats_corr[1]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "lr = dl.nb.LatexRenderer(chapter=3, start=13)\n",
    "lr.render(r'r = r_{xy} =\\frac{\\sum ^n _{i=1}(x_i - \\bar{x})(y_i - \\bar{y})}{\\sqrt{\\sum ^n _{i=1}(x_i - \\bar{x})^2} \\sqrt{\\sum ^n _{i=1}(y_i - \\bar{y})^2}}')\n",
    "lr.render(r'F(r) = {1 \\over 2}\\ln{1 + r \\over 1 - r} = \\operatorname{arctanh}(r)')\n",
    "lr.render(r'\\text{SE} = \\frac{1}{\\sqrt{n - 3}}')\n",
    "lr.render(r'z = \\frac{x - \\text{mean}}{\\text{SE}} = [F(r) - F(\\rho_0)]\\sqrt{n - 3}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "z = np.arctanh(stats_corr[0])\n",
    "n = len(df.index)\n",
    "se = 1/(math.sqrt(n - 3))\n",
    "ci = z + np.array([-1, 1]) * se * stats.norm.ppf((1 + 0.95)/2)\n",
    "\n",
    "ci = np.tanh(ci)\n",
    "dl.options.set_pd_options()\n",
    "ci_table = dl.report.DFBuilder(['Low', 'High'])\n",
    "ci_table.row([ci[0], ci[1]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "rs = check_random_state(34)\n",
    "\n",
    "ranges = []\n",
    "\n",
    "for j in range(200):\n",
    "    corrs = []\n",
    "\n",
    "    for i in range(100):\n",
    "        indices = rs.choice(n, size=n)\n",
    "        pairs = df.values\n",
    "        gen_pairs = pairs[indices]\n",
    "        corrs.append(stats.pearsonr(gen_pairs.T[0], gen_pairs.T[1])[0])\n",
    "\n",
    "    ranges.append(dl.stats.ci(corrs))\n",
    "\n",
    "ranges = np.array(ranges)\n",
    "bootstrap_ci = dl.stats.ci(corrs)\n",
    "ci_table.row([bootstrap_ci[0], bootstrap_ci[1]])\n",
    "ci_table = ci_table.build(index=['Formula', 'Bootstrap'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "dl.options.mimic_seaborn()\n",
    "context = dl.nb.Context('correlating_pearson')\n",
    "dl.nb.RcWidget(context)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "x = np.arange(len(ranges)) * 100\n",
    "plt.plot(x, ranges.T[0], label='Low')\n",
    "plt.plot(x, ranges.T[1], label='High')\n",
    "plt.plot(x, stats_corr[0] * np.ones_like(x), label='SciPy estimate')\n",
    "plt.ylabel('Pearson Correlation')\n",
    "plt.xlabel('Number of bootstraps')\n",
    "plt.title('Bootstrapped Pearson Correlation')\n",
    "plt.legend(loc='best')\n",
    "result = dl.report.HTMLBuilder()\n",
    "result.h1('Pearson Correlation Confidence intervals')\n",
    "result.h2('Confidence Intervals')\n",
    "result.add(ci_table.to_html())\n",
    "HTML(result.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
